# Predict DOC observations from environmental data ----
# Train an XGBoost model to predict DOC observations from the environmental
# data used in Wang et al., 2023.
source("utils.R")

# DOC observation data ----
# Read data, keep columns of interest and rename them.
# Keep only observations between 0 and 10 meters depth.
# Path to the merged Hansell 2022 DOC bottle data
doc_file <- "data/raw/0227166/4.4/data/0-data/All_Basins_Data_Merged_Hansell_2022.xlsx"

# First pass: read only the header row to retrieve lower-cased column names
columns <- doc_file %>%
  read_excel(n_max = 0) %>%
  colnames() %>%
  tolower()

# Second pass: read the actual data, skipping the header rows, and assign the
# clean column names retrieved above
df_doc <- read_excel(doc_file, skip = 2, col_names = columns)
# Select useful columns, renaming them to short names
df_doc <- df_doc %>%
  select(
    bottle, bottle_flag_w,
    lon = longitude, lat = latitude, date,
    press = `ctd pressure`,
    doc, doc_flag_w
  ) %>%
  mutate(date = ymd(date)) # warning because some dates are missing

# Keep only observations in the 0-10 m range
df_doc <- df_doc %>% filter(between(press, 0, 10))

# Clean data ----
Remove aberrant doc observations (flag = 9, negative values…).
# Remove aberrant DOC observations (quality flag = 9, negative values, NAs)
df_doc <- df_doc %>%
  filter(doc_flag_w != 9) %>%
  filter(doc > 0) %>% # NB: rows with NA doc are dropped here too
  drop_na(doc) # kept for explicitness, although redundant with the line above
# -> 7036 raw observations in the 0-10 meters depth range
Distribution in space and time
Map
# Map of observations
df_doc %>%
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat), size = 0.5) +
  coord_quickmap(expand = 0)

# Depth distribution
ggplot(df_doc) + geom_histogram(aes(x = press), bins = 100)

# Time distribution
ggplot(df_doc) + geom_histogram(aes(x = date), bins = 100)
summary(df_doc$date)
#>         Min.      1st Qu.       Median         Mean      3rd Qu.         Max.
#> "1994-03-01" "2004-06-22" "2010-03-20" "2009-01-25" "2013-08-27" "2021-08-13"
#>         NA's
#>         "28"
Average by pixel across the 0-10 m layer
First, we need to round coordinates to match them with other environmental data from GLODAP, then we can compute the average across the layer.
# Round coordinates to the 1° GLODAP grid, then average DOC within each pixel
df_doc <- df_doc %>%
  mutate(
    # floor and add 0.5 because other env data are on the centre of each pixel
    lon = roundp(lon, precision = 1, f = floor) + 0.5,
    lat = roundp(lat, precision = 1, f = floor) + 0.5
  ) %>%
  group_by(lon, lat) %>%
  summarise(
    # compute mean and sd per pixel, plus the number of raw observations
    doc_mean = mean(doc),
    doc_sd = sd(doc), # NA when a pixel contains a single observation
    n_obs = n()
  ) %>%
  ungroup()
# -> 2961 pixel-level observations. Let's plot a map!
# Number of observations per pixel
ggmap(df_doc, "n_obs", type = "point") +
  ggplot2::scale_colour_viridis_c(option = "cividis", trans = "log10", direction = -1) +
  ggtitle("Number of observations")

# DOC mean
ggmap(df_doc, "doc_mean", type = "point") +
  ggplot2::scale_colour_viridis_c(option = "A", trans = "log10") +
  ggtitle("DOC mean")

# DOC sd
ggmap(df_doc, "doc_sd", type = "point") +
  ggplot2::scale_colour_viridis_c() +
  ggtitle("DOC sd")

# Distribution of DOC values: if we want to predict these, they should be
# ~normally distributed, so compare the raw and log scales
ggplot(df_doc) + geom_histogram(aes(x = doc_mean))
ggplot(df_doc) + geom_histogram(aes(x = doc_mean)) + scale_x_continuous(trans = "log")
# Log-transformation is not perfect but should do!
# Work with log-transformed DOC as the modelling target
df_doc <- df_doc %>% mutate(doc_mean_log = log(doc_mean))

# Other environmental data (predictors) ----
Load data, and join with DOC data.
load("data/01.all_env.Rdata")
# Keep the predictors of interest, drop pixels with missing env data, then
# join with the DOC pixels and keep only pixels where DOC is available
all <- env %>%
  select(
    lon, lat,
    temperature,
    phosphate,
    silicate,
    alkalinity,
    dic,
    npp,
    oxygen
  ) %>%
  drop_na() %>% # drop missing env data
  left_join(df_doc, by = join_by(lon, lat)) %>%
  drop_na(doc_mean) # drop missing doc data

# Check the spatial match between env rasters and DOC observations
ggmap(env, var = "phosphate", type = "raster") +
  geom_point(data = all, aes(x = lon, y = lat), size = 0.5)
ggmap(env, var = "alkalinity", type = "raster") +
  geom_point(data = all, aes(x = lon, y = lat), size = 0.5)
# -> the doc-prediction dataset has 2311 observations

ggplot(all) + geom_histogram(aes(x = doc_mean_log))

# Predict DOC from env ----
Prepare data
Define variables roles
# For simplicity
df <- all

# Explanatory variables (model predictors)
exp_vars <- df %>%
  select(
    temperature,
    phosphate,
    silicate,
    alkalinity,
    dic,
    npp,
    oxygen
  ) %>%
  colnames()

# Metadata variables (kept in the data but not used to fit the model)
meta_vars <- df %>% select(lon, lat) %>% colnames()

# Split data ----
# Train VS test split (80/20), stratified on the outcome
df_split <- initial_split(df, prop = 0.8, strata = doc_mean_log)
df_train <- training(df_split)
df_test <- testing(df_split)

# Check that the outcome distributions of the two splits match
bind_rows(
  df_train %>% mutate(split = "train"),
  df_test %>% mutate(split = "test")
) %>%
  ggplot() +
  geom_density(aes(x = doc_mean_log, colour = split))

# Cross-validation, 10 folds, stratified
df_folds <- vfold_cv(df_train, v = 10, strata = doc_mean_log)

# Train model ----
Define model
# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  # NB: xgboost's threading argument is `nthread`; the previous
  # `num.threads = n_cores` was silently ignored by xgboost
  # (it warned: Parameters: { "num_threads" } are not used.)
  set_engine("xgboost", nthread = n_cores)
extract_parameter_set_dials(xgb_spec)
#> Collection of 4 parameters for tuning
#>  identifier       type    object
#>       trees      trees nparam[+]
#>       min_n      min_n nparam[+]
#>  tree_depth tree_depth nparam[+]
#>  learn_rate learn_rate nparam[+]
Recipe & workflow
# Generate formula from list of explanatory variables
# NB: we also add metadata variables and untransformed "doc_mean" for technical reasons but these will not be used to fit the model
xgb_form <- as.formula(paste0("doc_mean_log ~ ", paste(c("doc_mean", meta_vars, exp_vars), collapse = " + ")))
xgb_rec <- recipe(xgb_form, data = df_train) %>%
  update_role(lon, lat, new_role = "coords") %>% # These variables can be retained in the data but not included in the model
  update_role(doc_mean, new_role = "untransformed outcome")
summary(xgb_rec)
#> # A tibble: 11 × 4
#>    variable     type      role                  source
#>    <chr>        <list>    <chr>                 <chr>
#>  1 doc_mean     <chr [2]> untransformed outcome original
#>  2 lon          <chr [2]> coords                original
#>  3 lat          <chr [2]> coords                original
#>  4 temperature  <chr [2]> predictor             original
#>  5 phosphate    <chr [2]> predictor             original
#>  6 silicate     <chr [2]> predictor             original
#>  7 alkalinity   <chr [2]> predictor             original
#>  8 dic          <chr [2]> predictor             original
#>  9 npp          <chr [2]> predictor             original
#> 10 oxygen       <chr [2]> predictor             original
#> 11 doc_mean_log <chr [2]> outcome               original

# Assemble recipe and model into a workflow
xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)

# Gridsearch ----
# Latin hypercube sampling of the hyperparameter space
xgb_grid <- grid_latin_hypercube(
  trees(),
  learn_rate(),
  tree_depth(),
  min_n(),
  #loss_reduction(),
  #sample_size = sample_prop(),
  #finalize(mtry(), df_train),
  size = 30
)
doParallel::registerDoParallel()
# Tune hyperparameters over the CV folds
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)
autoplot(xgb_res)
show_best(xgb_res, metric = "rmse")
#> # A tibble: 5 × 10
#>   trees min_n tree_depth learn_rate .metric .estimator  mean     n std_err
#>   <int> <int>      <int>      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
#> 1   501    19         13    0.0495  rmse    standard   0.121    10 0.00529
#> 2  1257     6          7    0.00479 rmse    standard   0.122    10 0.00513
#> 3  1963    21         14    0.0831  rmse    standard   0.122    10 0.00529
#> 4  1451    37          2    0.0234  rmse    standard   0.127    10 0.00408
#> 5  1687    33          8    0.00119 rmse    standard   0.527    10 0.00486
#> # ℹ 1 more variable: .config <chr>
best_xgb <- select_best(xgb_res, metric = "rmse")

# Final fit ----
# Finalize the workflow with the best hyperparameters and refit on all of the
# training data
final_xgb <- finalize_workflow(
  xgb_wflow,
  best_xgb
)
final_res <- fit(final_xgb, df_train)
#> [12:30:12] WARNING: src/learner.cc:767:
#> Parameters: { "num_threads" } are not used.
Evaluate model
Predictions & metrics
# Predictions on the held-out test set
test_preds <- augment(final_res, new_data = df_test) %>%
  mutate(split = "test", .before = lon) %>% # tag split
  rename(.pred_logged = .pred) %>% # prediction is actually log of prediction
  mutate(.pred = exp(.pred_logged)) # back-transform to natural space

# Metrics in log space
rmse(test_preds, truth = doc_mean_log, estimate = .pred_logged)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rmse    standard       0.133
rsq(test_preds, truth = doc_mean_log, estimate = .pred_logged)
#> # A tibble: 1 × 3
#>   .metric .estimator .estimate
#>   <chr>   <chr>          <dbl>
#> 1 rsq     standard       0.852
Pred VS truth
# Pred VS truth, log space
test_preds %>%
  ggplot() +
  geom_point(aes(x = doc_mean_log, y = .pred_logged)) +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth in log-transformed space")

# Pred VS truth, natural space
test_preds %>%
  ggplot() +
  geom_point(aes(x = doc_mean, y = .pred)) +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth")

# Same two plots, coloured by absolute latitude
test_preds %>%
  ggplot() +
  geom_point(aes(x = doc_mean_log, y = .pred_logged, colour = abs(lat))) +
  scale_colour_viridis_c() +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth in log-transformed space")

test_preds %>%
  ggplot() +
  geom_point(aes(x = doc_mean, y = .pred, colour = abs(lat))) +
  scale_colour_viridis_c() +
  geom_abline(slope = 1, color = "red") +
  labs(title = "Pred VS truth")

# Relationship with alkalinity, for truth and for predictions
test_preds %>%
  ggplot() +
  geom_point(aes(x = alkalinity, y = doc_mean, colour = abs(lat))) +
  scale_colour_viridis_c() +
  labs(title = "Truth VS alkalinity")

test_preds %>%
  ggplot() +
  geom_point(aes(x = alkalinity, y = .pred_logged, colour = abs(lat))) +
  scale_colour_viridis_c() +
  labs(title = "Pred VS alkalinity")

# Interpret model ----
Explainer
Start by generating an explainer.
# Select only predictors (bake the prepped recipe on the training data)
vip_train <- xgb_rec %>% prep() %>% bake(new_data = NULL, all_predictors())
# Explainer for the fitted xgboost model
xgb_explain <- explain_tidymodels(
  model = extract_fit_parsnip(final_res),
  data = vip_train,
  y = df_train %>% pull(doc_mean_log)
)
#> Preparation of a new explainer is initiated
#>   -> model label       :  model_fit  (  default  )
#>   -> data              :  1847  rows  7  cols
#>   -> data              :  tibble converted into a data.frame
#>   -> target variable   :  1847  values
#>   -> predict function  :  yhat.model_fit  will be used  (  default  )
#>   -> predicted values  :  No value for predict function target column.  (  default  )
#>   -> model_info        :  package parsnip , ver. 1.1.1 , task regression  (  default  )
#>   -> predicted values  :  numerical, min =  3.64683 , mean =  4.234957 , max =  6.234434
#>   -> residual function :  difference between y and yhat  (  default  )
#>   -> residuals         :  numerical, min =  -0.2887175 , mean =  3.348189e-06 , max =  0.3489649
#> A new explainer has been created!
Variable importance
# Permutation-based variable importance
xgb_var_imp <- model_parts(xgb_explain)
ggplot_imp(xgb_var_imp)

# Number of partial dependence plots to draw
# NB: must be defined BEFORE extracting vars_pdp below (it was previously
# defined after its first use)
n_pdp <- 3

# Extract the n_pdp most important explanatory variables
vars_pdp <- xgb_var_imp %>%
  as_tibble() %>%
  arrange(desc(dropout_loss)) %>%
  filter(variable %in% exp_vars) %>%
  group_by(variable) %>%
  slice_head(n = 1) %>% # keep the worst permutation loss per variable
  ungroup() %>%
  arrange(desc(dropout_loss)) %>%
  slice_head(n = n_pdp) %>%
  pull(variable)

# Partial dependence plots ----
plots <- list()
for (i in seq_len(n_pdp)) {
  plots[[i]] <- plot_pdp(xgb_explain, vars_pdp[i])
}
plots